# A simple modelgraded eval checking if a completion is funny or not.
# Registry pattern used throughout this file: the short alias key (e.g.
# `joke-fruits`) points via `id` to a versioned spec key
# (`joke-fruits.dev.v0`), which names the eval class and its constructor args.
joke-fruits:
  id: joke-fruits.dev.v0
  metrics: [accuracy]
joke-fruits.dev.v0:
  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
  args:
    # Dataset of samples to grade, path relative to the registry's data dir.
    samples_jsonl: test_modelgraded/joke_fruits.jsonl
    # "reason then answer": the grader model reasons first, then classifies
    # (contrast with classify_cot / classify used in later stanzas).
    eval_type: cot_classify
    # Named modelgraded grading spec (prompt/choices) defined elsewhere.
    modelgraded_spec: humor

# (same eval as above, but with format_type="out_message")
# NOTE(review): no `format_type` arg appears below — the out_message formatting
# presumably lives inside the `humor_out_message` spec; verify against the
# modelgraded spec registry.
joke-fruits-v2:
  id: joke-fruits-v2.dev.v0
  metrics: [accuracy]
joke-fruits-v2.dev.v0:
  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
  args:
    # Same sample dataset as joke-fruits.
    samples_jsonl: test_modelgraded/joke_fruits.jsonl
    eval_type: cot_classify
    # Only difference from joke-fruits: the out_message variant of the spec.
    modelgraded_spec: humor_out_message

# (same eval as above, but with a likert scale of 1-5)
joke-fruits-likert:
  id: joke-fruits-likert.dev.v0
  metrics: [accuracy]
joke-fruits-likert.dev.v0:
  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
  args:
    # Same sample dataset as joke-fruits.
    samples_jsonl: test_modelgraded/joke_fruits.jsonl
    eval_type: cot_classify
    # Likert-scale variant of the humor grading spec.
    modelgraded_spec: humor_likert

# A meta-evaluation of a modelgraded eval checking if a completion is funny or
# not: instead of sampling completions, it grades pre-labeled data.
# This example uses a labeled dataset with "completion" and "choice" fields.
joke-fruits-meta:
  id: joke-fruits-meta.dev.v0
  metrics: [accuracy]
joke-fruits-meta.dev.v0:
  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
  args:
    # Labeled dataset (includes expected "choice" alongside each "completion").
    samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
    eval_type: cot_classify
    modelgraded_spec: humor
    # Enables meta-eval mode: compare the grader's choice against the label.
    metaeval: true

# (above, but with "answer then explain", rather than "reason then answer")
joke-fruits-expl-meta:
  id: joke-fruits-expl-meta.dev.v0
  metrics: [accuracy]
joke-fruits-expl-meta.dev.v0:
  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
  args:
    samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
    # classify_cot = answer first, then explain (reversed order vs cot_classify).
    eval_type: classify_cot
    modelgraded_spec: humor
    metaeval: true

# (above, but with the "answer" only — no chain-of-thought at all)
joke-fruits-ans-meta:
  id: joke-fruits-ans-meta.dev.v0
  metrics: [accuracy]
joke-fruits-ans-meta.dev.v0:
  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
  args:
    samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
    # classify = direct classification with no reasoning/explanation step.
    eval_type: classify
    modelgraded_spec: humor
    metaeval: true

# A simple modelgraded eval checking whether 4 completions to the same prompt
# are diverse.
diversity:
  id: diversity.dev.v0
  metrics: [accuracy]
diversity.dev.v0:
  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
  args:
    samples_jsonl: test_modelgraded/joke_fruits.jsonl
    eval_type: cot_classify
    modelgraded_spec: diversity
    # Sample 4 completions per prompt so the grader can judge their diversity.
    multicomp_n: 4
    # Extra kwargs forwarded when sampling completions; nonzero temperature so
    # the 4 completions can actually differ.
    sample_kwargs:
      temperature: 0.4

# A simple modelgraded eval checking which of N completions to the same prompt
# is the best response.
# Example command (one completion per listed model):
# `oaieval gpt-3.5-turbo,gpt-4,gpt-3.5-turbo best`
best:
  id: best.dev.v0
  metrics: [accuracy]
best.dev.v0:
  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
  args:
    samples_jsonl: test_modelgraded/joke_fruits.jsonl
    eval_type: cot_classify
    modelgraded_spec: best
    # N is taken from the comma-separated model list on the command line
    # (3 in the example command above), rather than a fixed number.
    multicomp_n: from_models
    # Forwarded to completion sampling for each model.
    sample_kwargs:
      temperature: 0.4
